# Read data ##################
library(NHANES)
library(dplyr)
df0 <- NHANES
# Restrict to adults aged 18-59 (Age >= 18 and Age < 60).
df <- NHANES[NHANES$Age >= 18 & NHANES$Age < 60, ]
# NHANES contains resampled rows; drop exact duplicates.
df <- df[!duplicated(df), ]
# Create a subset with only the variables of interest: SleepHrsNight, BMI, Age,
# Gender, Race1, TotChol, BPDiaAve, BPSysAve, AlcoholYear, Poverty,
# DaysMentHlthBad, UrineFlow1, PhysActive, DaysPhysHlthBad, Smoke100, HealthGen.
# (The original narration listed variables from a different project
# — Comorbidity1, CurrentSmoker, Fatalism, ... — corrected to match the code.)
data2 <- df %>% select(
  SleepHrsNight,
  BMI,
  Age,
  Gender,
  Race1,
  TotChol,
  BPDiaAve,
  BPSysAve,
  AlcoholYear,
  Poverty,
  DaysMentHlthBad,
  UrineFlow1,
  PhysActive,
  DaysPhysHlthBad,
  Smoke100,
  HealthGen
)
# "Gender", "Smoke100", and "PhysActive" are binary variables, so recode them
# as 0/1 and convert them to factors below. (The original narration named
# variables from a different project — CurrentSmoker, HiChol, Htn, R_E, Sex.)
# Data type conversion:
data2$Gender <- ifelse(data2$Gender == "male", 0, 1)
data2$Smoke100 <- ifelse(data2$Smoke100 == "No", 0, 1)
data2$PhysActive <- ifelse(data2$PhysActive == "No", 0, 1)
# Recode Race1 to numeric codes 1-5.
data2 <- data2 %>%
  mutate(
    Race1 = case_when(
      Race1 == 'Black' ~ 1,
      Race1 == 'Hispanic' ~ 2,
      Race1 == 'Mexican' ~ 3,
      Race1 == 'White' ~ 4,
      Race1 == 'Other' ~ 5,
      TRUE ~ NA_real_ # default: double NA to match the double RHS values above
    )
  )
# Recode HealthGen (Poor ... Excellent) to ordinal codes 1-5.
data2 <- data2 %>%
  mutate(
    HealthGen = case_when(
      HealthGen == 'Poor' ~ 1,
      HealthGen == 'Fair' ~ 2,
      HealthGen == 'Good' ~ 3,
      HealthGen == 'Vgood' ~ 4,
      HealthGen == 'Excellent' ~ 5,
      TRUE ~ NA_real_ # default: double NA to match the double RHS values above
    )
  )
# Convert the recoded variables to factors.
data2$PhysActive = as.factor(data2$PhysActive)
data2$Smoke100 = as.factor(data2$Smoke100)
data2$HealthGen = as.factor(data2$HealthGen)
data2$Race1 = as.factor(data2$Race1)
data2$Gender = as.factor(data2$Gender)
# Create an indicator variable "Incomplete" (0 if all variables are complete;
# 1 if at least one variable is missing).
miss_num <- rowSums(is.na(data2)) # per-row count of missing values
data2$Incomplete <- ifelse(miss_num > 0, 1, 0)
data2$Incomplete <- as.factor(data2$Incomplete)
# Create two categorical variables.
# NOTE(review): the original Age breaks c(20, 40, 60, max(Age)) are NOT sorted,
# because the data are restricted to Age < 60 (so max(Age) <= 59) and cut()
# errors on unsorted breaks. Use Inf as the upper bound instead; the top
# interval is then empty by construction here — confirm the intended cutpoints.
data2$Age_4Cat = cut(data2$Age, c(20, 40, 60, Inf), right = TRUE, labels = c(0:2))
# Use Inf for BMI as well: with right = FALSE the original [60, max) interval
# excluded the maximum BMI value itself (it became NA).
data2$BMI_cat = cut(data2$BMI, c(20, 40, 60, Inf), right = FALSE, labels = c(0:2))
# Split into complete-case and incomplete-case datasets.
data_complete = subset(data2, data2$Incomplete == 0)
data_incomplete = subset(data2, data2$Incomplete == 1)
# Two-sample comparison between complete and incomplete cases.
# NOTE(review): BMI is continuous, so sum(BMI)/n is a MEAN, not a proportion;
# the p*(1-p) standard error below is only valid for a binary variable.
# The structure is kept as in the original analysis but flagged for review.
n1f <- nrow(data_complete)   # was hard-coded as 2710
p1 <- sum(data_complete$BMI) / n1f
n2f <- nrow(data_incomplete) # was hard-coded as 1509
# na.rm is required: incomplete cases may have missing BMI, and sum() would
# otherwise return NA and propagate through every statistic below.
p2 <- sum(data_incomplete$BMI, na.rm = TRUE) / n2f
f_diff <- p1 - p2
se_f <- sqrt(p1 * (1 - p1) / n1f + p2 * (1 - p2) / n2f)
z <- f_diff / se_f
p_value_f <- 2 * (1 - pnorm(abs(z)))
low_bound_diff_f <- f_diff - 1.96 * se_f
high_bound_diff_f <- f_diff + 1.96 * se_f
# Continuous variables: get mean (sd) and p-values from two-sample t-tests.
# Prepare data
Cont_vars = c("BMI",
              "SleepHrsNight",
              "Age",
              "TotChol",
              "BPDiaAve",
              "BPSysAve",
              "AlcoholYear",
              "DaysPhysHlthBad",
              "Poverty",
              "UrineFlow1",
              "DaysMentHlthBad"
)
Cont_complete = subset(data_complete, select = Cont_vars)
Cont_incomplete = subset(data_incomplete, select = Cont_vars)
# Coerce every column to a plain numeric vector for the tests below.
Cont_incomplete_numeric <- as.data.frame(lapply(Cont_incomplete, as.numeric))
Cont_complete_numeric <- as.data.frame(lapply(Cont_complete, as.numeric))
# Get mean and sd for complete cases (no NA here by construction).
Cont_complete_mean = colMeans(Cont_complete_numeric)
Cont_complete_sd = vapply(Cont_complete_numeric, sd, numeric(1))
# Get mean and sd for incomplete cases (columns may contain NA).
Cont_incomplete_mean = vapply(Cont_incomplete_numeric, mean, numeric(1), na.rm = TRUE)
Cont_incomplete_sd = vapply(Cont_incomplete_numeric, sd, numeric(1), na.rm = TRUE)
# Count non-missing values for each continuous variable, separately in the
# complete-cases and incomplete-cases datasets.
Cont_Number_complete = colSums(!is.na(Cont_complete_numeric))
Cont_Number_incomplete = colSums(!is.na(Cont_incomplete_numeric))
# Determine whether the complete-case and incomplete-case variances are equal:
# TRUE (two-sided F-test p >= 0.05) means "assume equal variances" below.
Cont_F_P_value = sapply(seq_along(Cont_vars), function(x) var.test(Cont_complete_numeric[,x], Cont_incomplete_numeric[,x], alternative = "two.sided")$p.value>=0.05)
# Get p-values from two-sample t-tests (pooled variance where the F-test allowed it).
Cont_P_value = sapply(seq_along(Cont_vars), function(x) t.test(Cont_complete_numeric[,x], Cont_incomplete_numeric[,x], var.equal = Cont_F_P_value[x])$p.value)
# Generate the summary table used to fill in Table 1.
Cont_summary = data.frame(cbind(Cont_Number_complete, Cont_complete_mean, Cont_complete_sd,
                                Cont_Number_incomplete, Cont_incomplete_mean, Cont_incomplete_sd,
                                Cont_P_value))
format(Cont_summary, nsmall = 2) ## pad to at least two decimals (format() pads, it does not round)
# Categorical/binary variables: get proportions and p-values from chi-squared tests.
# Binary variables
# Prepare data
Bi_vars = c("Gender", "Smoke100", "PhysActive")
Bi_complete = subset(data_complete, select = Bi_vars)
Bi_incomplete = subset(data_incomplete, select = Bi_vars)
Bi_data = subset(data2, select = c(Bi_vars, "Incomplete"))
# Count the non-missing cases for each binary variable, separately in the
# complete-cases and incomplete-cases datasets.
Bi_Number_complete = apply(Bi_complete, 2, function(x) sum(complete.cases(x)))
Bi_Number_incomplete = apply(Bi_incomplete, 2, function(x) sum(complete.cases(x)))
# Proportion of cases at level 1 for each binary variable
# ([2] picks the second factor level, i.e. "1").
Bi_prop_complete = sapply(seq_along(Bi_vars), function(x) prop.table(table(Bi_complete[,x]))[2])
Bi_prop_incomplete = sapply(seq_along(Bi_vars), function(x) prop.table(table(Bi_incomplete[,x]))[2])
# Get p-values from chi-squared tests (no continuity correction).
Bi_P_value = sapply(Bi_vars, function(x) chisq.test(table(Bi_data[[x]], Bi_data$Incomplete), correct = FALSE)$p.value)
# Generate the summary table used to fill in Table 1.
Bi_summary = data.frame(cbind(Bi_Number_complete, Bi_prop_complete, Bi_Number_incomplete,
                              Bi_prop_incomplete, Bi_P_value))
format(Bi_summary, nsmall = 2) ## pad to at least two decimals (format() pads, it does not round)
# Categorical variables
# Prepare data
Cat_vars = c("HealthGen", "Race1")
Cat_complete = subset(data_complete, select = Cat_vars)
Cat_incomplete = subset(data_incomplete, select = Cat_vars)
Cat_data = subset(data2, select = c(Cat_vars, "Incomplete"))
# Count the non-missing cases for each categorical variable, separately in the
# complete-cases and incomplete-cases datasets.
Cat_Number_complete = apply(Cat_complete, 2, function(x) sum(complete.cases(x)))
Cat_Number_incomplete = apply(Cat_incomplete, 2, function(x) sum(complete.cases(x)))
# Proportions of each level for each categorical variable, separately in the
# complete-cases and incomplete-cases datasets (one row per variable).
Cat_prop_complete = t(apply(Cat_complete, 2, function(x) prop.table(table(x))))
Cat_prop_incomplete = t(apply(Cat_incomplete, 2, function(x) prop.table(table(x))))
# Get p-values from chi-squared tests (no continuity correction).
Cat_P_value = sapply(seq_along(Cat_vars), function(x) chisq.test(table(Cat_data[[Cat_vars[x]]], Cat_data$Incomplete), correct = FALSE)$p.value)
# Generate the summary table used to fill in Table 1.
Cat_summary = data.frame(cbind(Cat_Number_complete, Cat_prop_complete,
                               Cat_Number_incomplete,
                               Cat_prop_incomplete, Cat_P_value), check.names = FALSE)
format(Cat_summary, nsmall = 2) ## pad to at least two decimals (format() pads, it does not round)
# Basic characteristics ########################################
# NOTE(review): rm(list = ls()) wipes the entire workspace — kept only because
# this rendered document rebuilds everything from scratch in each section.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 745375 39.9 2364357 126.3 2364357 126.3
## Vcells 1799700 13.8 8388608 64.0 6847622 52.3
set.seed(123)
library(car)
library(olsrr)
library(ggplot2)
library(lmtest)
# Select variables
library(NHANES)
df0 <- NHANES
# Restrict to adults aged 18-59.
df <- NHANES[NHANES$Age >= 18 & NHANES$Age < 60,]
# Fraction of missing values per column.
colSums(is.na(df)) / nrow(df)
# Keep only columns with less than 30% missingness.
df <- df[, which(colSums(is.na(df)) / nrow(df) < 0.3)]
# Exclude duplicated rows.
df <- df[!duplicated(df),]
names(df)
## [1] "ID" "SurveyYr" "Gender" "Age"
## [5] "AgeDecade" "Race1" "Education" "MaritalStatus"
## [9] "HHIncome" "HHIncomeMid" "Poverty" "HomeRooms"
## [13] "HomeOwn" "Work" "Weight" "Height"
## [17] "BMI" "BMI_WHO" "Pulse" "BPSysAve"
## [21] "BPDiaAve" "BPSys1" "BPDia1" "BPSys2"
## [25] "BPDia2" "BPSys3" "BPDia3" "DirectChol"
## [29] "TotChol" "UrineVol1" "UrineFlow1" "Diabetes"
## [33] "HealthGen" "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [37] "Depressed" "SleepHrsNight" "SleepTrouble" "PhysActive"
## [41] "Alcohol12PlusYr" "AlcoholYear" "Smoke100" "Smoke100n"
## [45] "Marijuana" "RegularMarij" "HardDrugs" "SexEver"
## [49] "SexAge" "SexNumPartnLife" "SexNumPartYear" "SameSex"
## [53] "SexOrientation"
# Top-level expression: prints the BPSysAve column (leftover inspection step).
df$BPSysAve
library(dplyr)
# Keep only the analysis variables.
df2 <- df %>% select(
SleepHrsNight,
BMI,
Age,
Gender,
Race1,
TotChol,
BPDiaAve,
BPSysAve,
AlcoholYear,
Poverty,
DaysMentHlthBad,
UrineFlow1,
PhysActive,
DaysPhysHlthBad,
Smoke100,
HealthGen
)
# Complete-case dataset used for the Table 1 summary below.
df3 <- na.omit(df2)
library(dplyr)
library(gtsummary)
library(kableExtra)
# Add a three-level BMI category used to stratify Table 1.
df3 <- df3 %>%
  mutate(BMIcat = case_when(
    BMI < 18 ~ "<18",
    BMI >= 18 & BMI <= 30 ~ "18-30",
    BMI > 30 ~ ">30"
  ),
  BMIcat = factor(BMIcat, levels = c("<18", "18-30", ">30")))
# Participant-characteristics table, stratified by BMI category.
table_basic <- df3 %>%
  tbl_summary(
    by = BMIcat,
    statistic = list(all_continuous() ~ "{mean} ({sd})",
                     all_categorical() ~ "{n} ({p}%)"),
    digits = list(all_continuous() ~ c(2, 2),
                  all_categorical() ~ c(0, 1)),
    missing = "no"
  ) %>%
  modify_header(
    label = "**Variable**",
    all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
  ) %>%
  modify_caption("Participant characteristics, by BMI category") %>%
  bold_labels() %>%
  add_overall(col_label = "**All participants**<br>N = {N}") %>%
  add_p(
    test = list(
      all_continuous() ~ "kruskal.test",
      all_categorical() ~ "chisq.test"
    ),
    pvalue_fun = function(x) style_pvalue(x, digits = 3)
  ) %>%
  # NOTE(review): categorical variables use chi-squared tests (see add_p above),
  # so this blanket "Kruskal-Wallis" footnote is inaccurate for them.
  modify_footnote(
    all_stat_cols() ~ "p-values from Kruskal-Wallis rank sum test"
  )
## Warning for variable 'Race1':
## simpleWarning in stats::chisq.test(x = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 2L, : Chi-squared approximation may be incorrect
## Warning for variable 'HealthGen':
## simpleWarning in stats::chisq.test(x = structure(c(3L, 2L, 4L, 3L, 4L, 3L, 3L, : Chi-squared approximation may be incorrect
# View the table:
# print(table_basic)
# Convert the gtsummary table to a kableExtra object.
kable_table <- as_kable(table_basic)
# Style the table for LaTeX output.
latex_table <- kable_table %>%
  kable_styling(latex_options = c("striped", "hold_position"))
# Save the LaTeX table to file.
save_kable(file = "table_for_overleaf.tex", latex_table)
# MLR & Diagnosis, no log transform ########################################
# NOTE(review): rm(list = ls()) wipes the entire workspace — kept only because
# this rendered document rebuilds everything from scratch in each section.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1133413 60.6 2364357 126.3 2364357 126.3
## Vcells 2446264 18.7 8388608 64.0 6847622 52.3
set.seed(123)
library(car)
library(olsrr)
library(ggplot2)
library(lmtest)
# Select variables
library(NHANES)
df0 <- NHANES
# Restrict to adults aged 18-59.
df <- NHANES[NHANES$Age >= 18 & NHANES$Age < 60,]
# Fraction of missing values per column.
colSums(is.na(df)) / nrow(df)
# Keep only columns with less than 30% missingness.
df <- df[, which(colSums(is.na(df)) / nrow(df) < 0.3)]
# Exclude duplicated rows.
df <- df[!duplicated(df),]
names(df)
## [1] "ID" "SurveyYr" "Gender" "Age"
## [5] "AgeDecade" "Race1" "Education" "MaritalStatus"
## [9] "HHIncome" "HHIncomeMid" "Poverty" "HomeRooms"
## [13] "HomeOwn" "Work" "Weight" "Height"
## [17] "BMI" "BMI_WHO" "Pulse" "BPSysAve"
## [21] "BPDiaAve" "BPSys1" "BPDia1" "BPSys2"
## [25] "BPDia2" "BPSys3" "BPDia3" "DirectChol"
## [29] "TotChol" "UrineVol1" "UrineFlow1" "Diabetes"
## [33] "HealthGen" "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [37] "Depressed" "SleepHrsNight" "SleepTrouble" "PhysActive"
## [41] "Alcohol12PlusYr" "AlcoholYear" "Smoke100" "Smoke100n"
## [45] "Marijuana" "RegularMarij" "HardDrugs" "SexEver"
## [49] "SexAge" "SexNumPartnLife" "SexNumPartYear" "SameSex"
## [53] "SexOrientation"
df$BPSysAve # leftover inspection; prints the column
library(dplyr)
# Keep only the analysis variables.
df2 <- df %>% select(
  SleepHrsNight,
  BMI,
  Age,
  Gender,
  Race1,
  TotChol,
  BPDiaAve,
  BPSysAve,
  AlcoholYear,
  Poverty,
  DaysMentHlthBad,
  UrineFlow1,
  PhysActive,
  DaysPhysHlthBad,
  Smoke100,
  HealthGen
)
df3 <- na.omit(df2)
# NOTE(review): converts sleep from hours to minutes — confirm intended units,
# since later output still labels the variable SleepHrsNight.
df3$SleepHrsNight <- df3$SleepHrsNight * 60
# NOTE(review): the original next line used typographic quotes (a syntax error
# in R) and would drop SleepHrsNight, which hist() and fit0 below still need.
# Disabled to keep the script runnable:
# df3 <- df3[, -which(names(df3) %in% "SleepHrsNight")]
cor(df3$BPSysAve, df3$BPDiaAve)
psych::describe(df3)
psych::pairs.panels(df3)
hist(df3$SleepHrsNight)
colSums(is.na(df2)) / nrow(df2)
# Baseline model: regress sleep on all other variables.
fit0 <-
  lm(SleepHrsNight ~ .,
     data = df3)
# Data type recoding (0/1 for binaries, numeric codes for Race1/HealthGen).
df3$Gender <- ifelse(df3$Gender == "male", 0, 1)
df3$Smoke100 <- ifelse(df3$Smoke100 == "No", 0, 1)
df3$PhysActive <- ifelse(df3$PhysActive == "No", 0, 1)
df3 <- df3 %>%
  mutate(
    Race1 = case_when(
      Race1 == 'Black' ~ 1,
      Race1 == 'Hispanic' ~ 2,
      Race1 == 'Mexican' ~ 3,
      Race1 == 'White' ~ 4,
      Race1 == 'Other' ~ 5,
      TRUE ~ NA_real_ # default: double NA to match the double RHS values above
    )
  )
df3 <- df3 %>%
  mutate(
    HealthGen = case_when(
      HealthGen == 'Poor' ~ 1,
      HealthGen == 'Fair' ~ 2,
      HealthGen == 'Good' ~ 3,
      HealthGen == 'Vgood' ~ 4,
      HealthGen == 'Excellent' ~ 5,
      TRUE ~ NA_real_ # default: double NA to match the double RHS values above
    )
  )
# model_3: add additional risk factors ##
m_3 = lm(
  BMI ~ SleepHrsNight + Age + Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 + DaysMentHlthBad +
    DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  df3
)
summary(m_3)
##
## Call:
## lm(formula = BMI ~ SleepHrsNight + Age + Gender + factor(Race1) +
## Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 +
## UrineFlow1 + DaysMentHlthBad + DaysPhysHlthBad + factor(HealthGen) +
## PhysActive, data = df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.617 -4.039 -0.661 3.256 35.707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.324898 1.832007 14.915 < 2e-16 ***
## SleepHrsNight -0.117162 0.103681 -1.130 0.258587
## Age 0.003943 0.013319 0.296 0.767244
## Gender 0.407763 0.280402 1.454 0.146030
## factor(Race1)2 -1.940769 0.619152 -3.135 0.001744 **
## factor(Race1)3 -1.315083 0.542230 -2.425 0.015374 *
## factor(Race1)4 -1.510239 0.410939 -3.675 0.000243 ***
## factor(Race1)5 -3.041375 0.605088 -5.026 5.40e-07 ***
## Poverty 0.056116 0.088919 0.631 0.528047
## TotChol 0.008566 0.131873 0.065 0.948215
## BPDiaAve 0.061458 0.013402 4.586 4.77e-06 ***
## BPSysAve 0.046897 0.011244 4.171 3.15e-05 ***
## AlcoholYear -0.008133 0.001470 -5.532 3.55e-08 ***
## Smoke100 -0.912564 0.280827 -3.250 0.001173 **
## UrineFlow1 -0.086054 0.138360 -0.622 0.534033
## DaysMentHlthBad -0.037214 0.017514 -2.125 0.033716 *
## DaysPhysHlthBad 0.019369 0.020325 0.953 0.340703
## factor(HealthGen)2 -2.921984 0.974038 -3.000 0.002731 **
## factor(HealthGen)3 -4.504202 0.965175 -4.667 3.24e-06 ***
## factor(HealthGen)4 -6.258420 0.990705 -6.317 3.21e-10 ***
## factor(HealthGen)5 -8.167162 1.044199 -7.821 8.00e-15 ***
## PhysActive -0.778610 0.286658 -2.716 0.006655 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.21 on 2225 degrees of freedom
## Multiple R-squared: 0.1534, Adjusted R-squared: 0.1454
## F-statistic: 19.2 on 21 and 2225 DF, p-value: < 2.2e-16
car::Anova(m_3, type = "III")
# model 3 diagnosis ###########
par(mfrow = c(2, 3)) #read more from ?plot.lm
plot(m_3, which = 1)
plot(m_3, which = 2)
plot(m_3, which = 3)
plot(m_3, which = 4)
plot(m_3, which = 5)
plot(m_3, which = 6)
par(mfrow = c(1, 1)) # reset
m_3.yhat = m_3$fitted.values
m_3.res = m_3$residuals
m_3.h = hatvalues(m_3)  # leverages
m_3.r = rstandard(m_3)  # internally studentized residuals
m_3.rr = rstudent(m_3)  # externally studentized residuals
# Which subject is most outlying with respect to the x space (largest leverage)?
Hmisc::describe(m_3.h)
## m_3.h
## n missing distinct Info Mean Gmd .05 .10
## 2247 0 2247 1 0.009791 0.005107 0.004459 0.004936
## .25 .50 .75 .90 .95
## 0.006261 0.008725 0.011708 0.015200 0.018498
##
## lowest : 0.002814313 0.002982653 0.003117284 0.003208262 0.003227126
## highest: 0.038012401 0.042497096 0.048833386 0.048889933 0.055461319
# Observation with the largest leverage (hat value).
m_3.h[which.max(m_3.h)]
## 422
## 0.05546132
# Assumption checks: LINE ##############################
# (1) Linearity: two approaches
# Partial regression (added-variable) plots:
car::avPlots(m_3)
# (2) Independence:
residuals <- resid(m_3)
acf(residuals, main = "Autocorrelation Function of Residuals")
pacf(residuals, main = "Partial Autocorrelation Function of Residuals")
dw_test <- dwtest(m_3)
print(dw_test)
##
## Durbin-Watson test
##
## data: m_3
## DW = 2.0074, p-value = 0.5688
## alternative hypothesis: true autocorrelation is greater than 0
# (3) Equal (constant) variance: residuals vs fitted values; a
# variance-stabilizing transformation is an option if violated.
car::residualPlots(m_3, type = "response")
## Test stat Pr(>|Test stat|)
## SleepHrsNight -0.4976 0.6188065
## Age -3.8434 0.0001247 ***
## Gender -0.6793 0.4970226
## factor(Race1)
## Poverty -1.1801 0.2380940
## TotChol -1.5798 0.1142852
## BPDiaAve 0.5367 0.5915115
## BPSysAve -3.6070 0.0003165 ***
## AlcoholYear 1.9234 0.0545555 .
## Smoke100 0.0234 0.9813551
## UrineFlow1 -0.3883 0.6978492
## DaysMentHlthBad -0.3539 0.7234147
## DaysPhysHlthBad 0.5682 0.5699655
## factor(HealthGen)
## PhysActive -0.2079 0.8353039
## Tukey test 0.4000 0.6891547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(m_3, which = 1)
# or, equivalently, with ggplot:
ggplot(m_3, aes(x = m_3.yhat, y = m_3.res)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "black") +
  labs(title = "constant variance assumption",
       x = "y hat",
       y = "Residuals") +
  theme_minimal()
# Conclusion: the constant variance assumption is not clearly violated — the
# residual spread looks fairly uniform across the range of fitted values.
# (4) Normality: inspect the residual distribution (histogram, boxplot, Q-Q
# plot, Shapiro test); transform the response if clearly non-normal.
# Examine the quantiles of the residuals:
Hmisc::describe(m_3.res)
## m_3.res
## n missing distinct Info Mean Gmd .05 .10
## 2247 0 2247 1 8.085e-17 6.67 -8.6246 -6.9209
## .25 .50 .75 .90 .95
## -4.0390 -0.6607 3.2560 7.4840 10.6204
##
## lowest : -16.61655 -14.99224 -14.93896 -13.77626 -13.40159
## highest: 28.87262 31.61046 33.88930 33.95985 35.70682
Hmisc::describe(m_3.res)$counts[c(".25", ".50", ".75")] # quartiles: not symmetric
## .25 .50 .75
## "-4.0390" "-0.6607" " 3.2560"
# Histogram
par(mfrow = c(1, 1))
hist(m_3.res, breaks = 15)
# Q-Q plot (qqPlot returns the indices of the two most extreme residuals)
qq.m_3.res = car::qqPlot(m_3.res)
m_3.res[qq.m_3.res]
## 1943 910
## 35.70682 33.95985
# Influential observations #################
influence2 = data.frame(
  Residual = resid(m_3),
  Rstudent = rstudent(m_3),
  HatDiagH = hat(model.matrix(m_3)),
  CovRatio = covratio(m_3),
  DFFITS = dffits(m_3),
  COOKsDistance = cooks.distance(m_3)
)
# DFFITS
ols_plot_dffits(m_3)
influence2[order(abs(influence2$DFFITS), decreasing = TRUE), ] %>% head()
# From the plot above, observations 879 and 1769 have the largest |DFFITS|
# (0.5673 and 0.5872 respectively, read from the output dataset).
# Cook's D
ols_plot_cooksd_bar(m_3)
influence2[order(influence2$COOKsDistance, decreasing = TRUE), ] %>% head()
# Observations 879 and 1769 also have the largest Cook's distances
# (0.0108 and 0.0145 respectively).
# Leverage
ols_plot_resid_lev(m_3)
# High leverage
influence2[order(influence2$HatDiagH, decreasing = TRUE), ] %>% head()
# High studentized residual
influence2[order(influence2$Rstudent, decreasing = TRUE), ] %>% head()
# Observation 1155 has the largest leverage (0.0368); observation 1862 has the
# largest externally studentized residual in magnitude (5.9649). Seven
# observations (1048, 1769, 1684, 74, 72, 1689, 1311) exceed both the outlier
# and leverage thresholds, so they are potentially influential.
# Thresholds: |externally studentized residual| > 2; leverage > 0.011 (R default).
# NOTE(review): the narrative said "remove these 14 observations" but the code
# below removes 10 row indices — reconcile the text with the code.
rm3.df3 = df3[-c(170, 208, 444, 926, 1361, 1454, 1546, 1655, 1910, 1958), ]
# NOTE(review): unlike m_3, this refit enters Race1 and HealthGen as numeric
# scores (no factor()), so its coefficients are NOT directly comparable to m_3.
rm.m_3 = lm(
  BMI ~ SleepHrsNight + Age + Gender + Race1 + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 + DaysMentHlthBad +
    DaysPhysHlthBad + HealthGen + PhysActive,
  rm3.df3
)
# Before removing these observations, the estimated coefficients are:
summary(m_3)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.324897955 1.832007232 14.91527843 5.063682e-48
## SleepHrsNight -0.117162147 0.103680975 -1.13002551 2.585873e-01
## Age 0.003942574 0.013318707 0.29601778 7.672441e-01
## Gender 0.407763087 0.280402482 1.45420641 1.460301e-01
## factor(Race1)2 -1.940768951 0.619151591 -3.13456184 1.743558e-03
## factor(Race1)3 -1.315082879 0.542229766 -2.42532402 1.537382e-02
## factor(Race1)4 -1.510238608 0.410939027 -3.67509170 2.433861e-04
## factor(Race1)5 -3.041375233 0.605088431 -5.02633182 5.398260e-07
## Poverty 0.056115667 0.088918729 0.63108940 5.280469e-01
## TotChol 0.008565966 0.131872653 0.06495635 9.482146e-01
## BPDiaAve 0.061457725 0.013401526 4.58587505 4.771754e-06
## BPSysAve 0.046897025 0.011243719 4.17095295 3.149657e-05
## AlcoholYear -0.008132748 0.001470236 -5.53159289 3.545838e-08
## Smoke100 -0.912563541 0.280827055 -3.24955707 1.173076e-03
## UrineFlow1 -0.086054222 0.138360108 -0.62195833 5.340330e-01
## DaysMentHlthBad -0.037213746 0.017514270 -2.12476717 3.371600e-02
## DaysPhysHlthBad 0.019369197 0.020324834 0.95298182 3.407028e-01
## factor(HealthGen)2 -2.921983768 0.974038001 -2.99986629 2.730955e-03
## factor(HealthGen)3 -4.504202045 0.965175428 -4.66671852 3.242377e-06
## factor(HealthGen)4 -6.258419551 0.990705138 -6.31713646 3.208005e-10
## factor(HealthGen)5 -8.167161763 1.044199344 -7.82145843 7.997961e-15
## PhysActive -0.778610338 0.286657981 -2.71616487 6.655396e-03
# After removing these observations, the estimated coefficients are:
summary(rm.m_3)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.931772229 1.610013332 17.34878319 2.493565e-63
## SleepHrsNight -0.149306764 0.102448360 -1.45738559 1.451513e-01
## Age 0.007538154 0.013092151 0.57577659 5.648245e-01
## Gender 0.437837920 0.275071652 1.59172316 1.115892e-01
## Race1 -0.494790939 0.116629002 -4.24243481 2.301832e-05
## Poverty 0.080793680 0.086489789 0.93414125 3.503326e-01
## TotChol 0.007081371 0.129713646 0.05459233 9.564682e-01
## BPDiaAve 0.066739760 0.013403431 4.97930420 6.872480e-07
## BPSysAve 0.043740703 0.011169838 3.91596587 9.275883e-05
## AlcoholYear -0.007609936 0.001445499 -5.26457270 1.540361e-07
## Smoke100 -0.824668424 0.275771138 -2.99040875 2.816661e-03
## UrineFlow1 -0.094603489 0.136349841 -0.69382911 4.878619e-01
## DaysMentHlthBad -0.040870114 0.017274127 -2.36597271 1.806818e-02
## DaysPhysHlthBad 0.015732946 0.019579489 0.80354219 4.217474e-01
## HealthGen -1.674247750 0.156736230 -10.68194477 5.230180e-26
## PhysActive -0.749311170 0.282521486 -2.65222720 8.053167e-03
# Percent change in coefficients after removing the observations.
# NOTE(review): m_3 has 22 coefficients (factor dummies) while rm.m_3 has 16
# (numeric Race1/HealthGen), so this elementwise comparison recycles values and
# the resulting percentages are not meaningful — compare matching model
# specifications instead.
abs((rm.m_3$coefficients - m_3$coefficients) / (m_3$coefficients) * 100)
## (Intercept) SleepHrsNight Age Gender
## 2.220957e+00 2.743601e+01 9.119879e+01 7.375565e+00
## factor(Race1)2 factor(Race1)3 factor(Race1)4 factor(Race1)5
## 7.450542e+01 1.061436e+02 1.004689e+02 1.021944e+02
## Poverty TotChol BPDiaAve BPSysAve
## 2.205260e+01 1.888392e+02 1.441847e+03 3.017260e+02
## AlcoholYear Smoke100 UrineFlow1 DaysMentHlthBad
## 4.025376e+02 1.017240e+02 1.845573e+03 1.913533e+03
## DaysPhysHlthBad factor(HealthGen)2 factor(HealthGen)3 factor(HealthGen)4
## 1.441072e+05 9.489023e+01 1.001674e+02 1.069960e+02
## factor(HealthGen)5 PhysActive
## 9.394170e+01 1.103767e+02
The estimated regression coefficients change noticeably after removing these observations: 5 of the estimates changed by more than 10%. The p-value for the SleepHrsNight coefficient becomes non-significant at the 95% confidence level.
# Multicollinearity ######################
# Pairwise Pearson correlations
var3 = c(
  "BMI",
  "SleepHrsNight",
  "Age",
  "Gender",
  "Race1",
  "TotChol",
  "BPDiaAve",
  "BPSysAve",
  "AlcoholYear",
  "Smoke100",
  "DaysPhysHlthBad",
  "PhysActive",
  "Poverty",
  "UrineFlow1",
  "DaysMentHlthBad",
  "HealthGen"
)
newData3 = df3[, var3]
library("corrplot")
## corrplot 0.92 loaded
par(mfrow = c(1, 2))
# NOTE(review): the original passed method = "pearson" to as.matrix() instead
# of cor(); cor() defaulted to Pearson anyway, but the argument belongs to cor().
cormat3 = cor(as.matrix(newData3[, -c(1)]), method = "pearson")
p.mat3 = cor.mtest(as.matrix(newData3[, -c(1)]))$p
corrplot(
  cormat3,
  method = "color",
  type = "upper",
  number.cex = 1,
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  p.mat = p.mat3,
  sig.level = 0.05,
  insig = "blank" # trailing comma removed: an empty trailing argument can error
)
# None of the covariates appear strongly correlated; there is no evidence of
# collinearity from the pairwise correlations.
# Collinearity diagnostics (VIF / GVIF)
car::vif(m_3)
## GVIF Df GVIF^(1/(2*Df))
## SleepHrsNight 1.069282 1 1.034061
## Age 1.342254 1 1.158557
## Gender 1.140187 1 1.067795
## factor(Race1) 1.245248 4 1.027796
## Poverty 1.321502 1 1.149566
## TotChol 1.130821 1 1.063400
## BPDiaAve 1.445271 1 1.202194
## BPSysAve 1.562024 1 1.249810
## AlcoholYear 1.122271 1 1.059373
## Smoke100 1.141210 1 1.068274
## UrineFlow1 1.045366 1 1.022432
## DaysMentHlthBad 1.144347 1 1.069742
## DaysPhysHlthBad 1.247355 1 1.116851
## factor(HealthGen) 1.455605 4 1.048046
## PhysActive 1.166136 1 1.079878
From the VIF values in the output above, once again we do not observe any potential collinearity issues. In fact, the VIF values are fairly small: none of the values exceed 10.
# MLR & Diagnosis with log-transformed outcome ########################################
# model_3: add additional risk factors ##
# NOTE(review): BMI is always positive, so the +1 shift is not needed for
# log() to be defined — confirm whether log(BMI) alone was intended.
df3$logBMI = log(df3$BMI + 1)
m_3 = lm(
  logBMI ~ SleepHrsNight + DaysMentHlthBad+Age + Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  df3
)
summary(m_3)
##
## Call:
## lm(formula = logBMI ~ SleepHrsNight + DaysMentHlthBad + Age +
## Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve +
## AlcoholYear + Smoke100 + UrineFlow1 + DaysPhysHlthBad + factor(HealthGen) +
## PhysActive, data = df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.61804 -0.12733 -0.00485 0.12226 0.78086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.278e+00 5.765e-02 56.863 < 2e-16 ***
## SleepHrsNight -4.718e-03 3.263e-03 -1.446 0.148321
## DaysMentHlthBad -1.265e-03 5.512e-04 -2.295 0.021825 *
## Age 4.023e-04 4.192e-04 0.960 0.337200
## Gender 1.807e-03 8.824e-03 0.205 0.837761
## factor(Race1)2 -4.739e-02 1.949e-02 -2.432 0.015084 *
## factor(Race1)3 -2.532e-02 1.706e-02 -1.484 0.138009
## factor(Race1)4 -4.245e-02 1.293e-02 -3.282 0.001045 **
## factor(Race1)5 -9.451e-02 1.904e-02 -4.963 7.45e-07 ***
## Poverty 2.848e-03 2.798e-03 1.018 0.308886
## TotChol 3.721e-03 4.150e-03 0.897 0.370054
## BPDiaAve 1.961e-03 4.217e-04 4.649 3.53e-06 ***
## BPSysAve 1.532e-03 3.539e-04 4.331 1.55e-05 ***
## AlcoholYear -2.676e-04 4.627e-05 -5.784 8.33e-09 ***
## Smoke100 -2.991e-02 8.838e-03 -3.384 0.000726 ***
## UrineFlow1 -2.890e-03 4.354e-03 -0.664 0.506924
## DaysPhysHlthBad 6.034e-04 6.396e-04 0.943 0.345644
## factor(HealthGen)2 -8.521e-02 3.065e-02 -2.780 0.005486 **
## factor(HealthGen)3 -1.264e-01 3.037e-02 -4.163 3.27e-05 ***
## factor(HealthGen)4 -1.802e-01 3.118e-02 -5.781 8.48e-09 ***
## factor(HealthGen)5 -2.469e-01 3.286e-02 -7.512 8.36e-14 ***
## PhysActive -2.262e-02 9.021e-03 -2.507 0.012232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1954 on 2225 degrees of freedom
## Multiple R-squared: 0.1547, Adjusted R-squared: 0.1467
## F-statistic: 19.39 on 21 and 2225 DF, p-value: < 2.2e-16
car::Anova(m_3, type = "III")
# model 3 diagnosis ###########
par(mfrow = c(2, 3)) #read more from ?plot.lm
plot(m_3, which = 1)
plot(m_3, which = 2)
plot(m_3, which = 3)
plot(m_3, which = 4)
plot(m_3, which = 5)
plot(m_3, which = 6)
par(mfrow = c(1, 1)) # reset
m_3.yhat = m_3$fitted.values
m_3.res = m_3$residuals
m_3.h = hatvalues(m_3)  # leverages (depend only on X, so identical to the untransformed model)
m_3.r = rstandard(m_3)  # internally studentized residuals
m_3.rr = rstudent(m_3)  # externally studentized residuals
# Which subject is most outlying with respect to the x space (largest leverage)?
Hmisc::describe(m_3.h)
## m_3.h
## n missing distinct Info Mean Gmd .05 .10
## 2247 0 2247 1 0.009791 0.005107 0.004459 0.004936
## .25 .50 .75 .90 .95
## 0.006261 0.008725 0.011708 0.015200 0.018498
##
## lowest : 0.002814313 0.002982653 0.003117284 0.003208262 0.003227126
## highest: 0.038012401 0.042497096 0.048833386 0.048889933 0.055461319
# Observation with the largest leverage (hat value).
m_3.h[which.max(m_3.h)]
## 422
## 0.05546132
# Assumption checks: LINE ##############################
# (1) Linearity: two approaches
# Partial regression (added-variable) plots:
car::avPlots(m_3)
# (2) Independence:
residuals <- resid(m_3)
acf(residuals, main = "Autocorrelation Function of Residuals")
pacf(residuals, main = "Partial Autocorrelation Function of Residuals")
dw_test <- dwtest(m_3)
print(dw_test)
##
## Durbin-Watson test
##
## data: m_3
## DW = 2.0148, p-value = 0.6364
## alternative hypothesis: true autocorrelation is greater than 0
# (3) Equal (constant) variance: residuals vs fitted values; a
# variance-stabilizing transformation is an option if violated.
car::residualPlots(m_3, type = "response")
## Test stat Pr(>|Test stat|)
## SleepHrsNight -0.7992 0.42429
## DaysMentHlthBad -0.2089 0.83458
## Age -4.0747 4.769e-05 ***
## Gender -0.3071 0.75882
## factor(Race1)
## Poverty -0.9680 0.33315
## TotChol -1.8454 0.06511 .
## BPDiaAve 0.5158 0.60601
## BPSysAve -4.3661 1.323e-05 ***
## AlcoholYear 1.9986 0.04577 *
## Smoke100 0.5060 0.61289
## UrineFlow1 -0.2751 0.78326
## DaysPhysHlthBad 0.7872 0.43126
## factor(HealthGen)
## PhysActive 0.0674 0.94625
## Tukey test -1.8323 0.06691 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(m_3, which = 1)
# or, equivalently, with ggplot:
ggplot(m_3, aes(x = m_3.yhat, y = m_3.res)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "black") +
  labs(title = "constant variance assumption",
       x = "y hat",
       y = "Residuals") +
  theme_minimal()
# Conclusion: the constant variance assumption is not clearly violated — the
# residual spread looks fairly uniform across the range of fitted values.
# (4) Normality: inspect the residual distribution (histogram, boxplot, Q-Q
# plot, Shapiro test); transform the response if clearly non-normal.
# Examine the quantiles of the residuals:
Hmisc::describe(m_3.res)
## m_3.res
## n missing distinct Info Mean Gmd .05 .10
## 2247 0 2247 1 2.528e-18 0.2172 -0.30704 -0.23984
## .25 .50 .75 .90 .95
## -0.12733 -0.00485 0.12226 0.24533 0.32858
##
## lowest : -0.6180372 -0.5974308 -0.5951820 -0.5507425 -0.5295147
## highest: 0.7040679 0.7130134 0.7234443 0.7624020 0.7808639
Hmisc::describe(m_3.res)$counts[c(".25", ".50", ".75")] # quartiles — NOTE(review): for logBMI these look roughly symmetric, unlike the untransformed model
## .25 .50 .75
## "-0.12733" "-0.00485" " 0.12226"
# Histogram
par(mfrow = c(1, 1))
hist(m_3.res, breaks = 15)
# Q-Q plot (qqPlot returns the indices of the two most extreme residuals)
qq.m_3.res = car::qqPlot(m_3.res)
m_3.res[qq.m_3.res]
## 910 1943
## 0.7808639 0.7624020
influential observations #################
influence3 = data.frame(
Residual = resid(m_3),
Rstudent = rstudent(m_3),
HatDiagH = hat(model.matrix(m_3)),
CovRatio = covratio(m_3),
DFFITS = dffits(m_3),
COOKsDistance = cooks.distance(m_3)
)
DFFITS
ols_plot_dffits(m_3)
influence3[order(abs(influence3$DFFITS), decreasing = T), ] %>% head()
From the plot above, we can see 2 observations with the largest (magnitude) of DFFITS, observation 879 and 1769 By printing the corresponding values of DFFITS in the output dataset, we can obtain their DFFITS values: 0.5673 for observation 879, 0.5872 for observation 1769
Cook’s D
ols_plot_cooksd_bar(m_3)
influence3[order(influence3$COOKsDistance, decreasing = T), ] %>% head()
From the plot above, we can see that the observation 879 and 1769 also have the largest Cook’s Distance. By printing the corresponding values of Cook’s D in the output dataset, we can obtain their Cook’s D values:0.0108 for observation 879, 0.0145 for observation 1769
leverage
ols_plot_resid_lev(m_3)
high leverage
influence3[order(influence3$HatDiagH, decreasing = T), ] %>% head()
high studentized residual
influence3[order(influence3$Rstudent, decreasing = T), ] %>% head()
From the plot above, we can see that observation 1155 has the largest leverage (0.0368), and observation 1862 has the largest externally studentized residual in magnitude (5.9649).
From the plot above, there are 7 observations (1048, 1769, 1684, 74, 72, 1689, 1311) located in the intersection region of both outlier and leverage; that is, these observations have both leverage and externally studentized residuals exceeding their respective thresholds. Because of their large DFFITS and Cook's D values, they are potentially influential observations.
The thresholds for the externally studentized residual are -2 and 2, i.e. 2 in magnitude. The leverage threshold (the R default) is 0.011.
From the DFFITS results, observations 879 and 1769 appear to be influential, and observation 1155 has extraordinarily large leverage. Therefore, I chose to remove the flagged observations and re-fit the model.
# Re-fit model 3 after dropping the rows flagged as potentially influential.
# NOTE(review): the row indices below do not match the observations named in
# the narrative (879, 1769, 1155) -- confirm which rows were meant to go.
rm3.df3 <- df3[-c(444, 926, 1348, 1454, 1655, 1910, 1958), ]
# BUG FIX: Race1 and HealthGen must keep the factor() coding used in m_3.
# Without it (as in the original refit, see the printed coefficients) they are
# treated as numeric scores, and the coefficient-by-coefficient comparison
# below compares two differently parameterized models.
rm.m_3 <- lm(
  logBMI ~ SleepHrsNight + Age + Gender + factor(Race1) + Poverty + TotChol +
    BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysMentHlthBad + DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  data = rm3.df3
)
Before removing these observations, the estimated coefficients are:
summary(m_3)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2784100219 0.0576543432 56.8631926 0.000000e+00
## SleepHrsNight -0.0047181234 0.0032629012 -1.4459903 1.483208e-01
## DaysMentHlthBad -0.0012649713 0.0005511844 -2.2950057 2.182547e-02
## Age 0.0004023475 0.0004191475 0.9599186 3.372005e-01
## Gender 0.0018070868 0.0088244307 0.2047823 8.377610e-01
## factor(Race1)2 -0.0473922190 0.0194850641 -2.4322332 1.508442e-02
## factor(Race1)3 -0.0253196710 0.0170642891 -1.4837812 1.380086e-01
## factor(Race1)4 -0.0424502990 0.0129324924 -3.2824530 1.045004e-03
## factor(Race1)5 -0.0945148238 0.0190424882 -4.9633652 7.453716e-07
## Poverty 0.0028481217 0.0027983246 1.0177953 3.088859e-01
## TotChol 0.0037207960 0.0041501098 0.8965536 3.700541e-01
## BPDiaAve 0.0019606291 0.0004217539 4.6487515 3.534993e-06
## BPSysAve 0.0015324057 0.0003538465 4.3307081 1.552264e-05
## AlcoholYear -0.0002676120 0.0000462692 -5.7838049 8.332325e-09
## Smoke100 -0.0299111317 0.0088377923 -3.3844574 7.255600e-04
## UrineFlow1 -0.0028901130 0.0043542738 -0.6637417 5.069244e-01
## DaysPhysHlthBad 0.0006033501 0.0006396345 0.9432733 3.456435e-01
## factor(HealthGen)2 -0.0852089875 0.0306535477 -2.7797431 5.485942e-03
## factor(HealthGen)3 -0.1264388994 0.0303746374 -4.1626472 3.265555e-05
## factor(HealthGen)4 -0.1802336838 0.0311780723 -5.7807834 8.481117e-09
## factor(HealthGen)5 -0.2468663127 0.0328615664 -7.5123112 8.362784e-14
## PhysActive -0.0226201539 0.0090212949 -2.5074176 1.223242e-02
After removing these observations, the estimated coefficients are:
summary(rm.m_3)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3050560522 5.102395e-02 64.7746013 0.000000e+00
## SleepHrsNight -0.0047524563 3.249225e-03 -1.4626432 1.437063e-01
## Age 0.0004216798 4.153499e-04 1.0152398 3.101019e-01
## Gender 0.0032635982 8.718949e-03 0.3743110 7.082087e-01
## Race1 -0.0153799757 3.699991e-03 -4.1567606 3.350196e-05
## Poverty 0.0033436926 2.742497e-03 1.2192148 2.228920e-01
## TotChol 0.0038474226 4.117724e-03 0.9343565 3.502215e-01
## BPDiaAve 0.0021017603 4.252063e-04 4.9429193 8.270640e-07
## BPSysAve 0.0015086789 3.531594e-04 4.2719493 2.019304e-05
## AlcoholYear -0.0002568527 4.589462e-05 -5.5965757 2.455559e-08
## Smoke100 -0.0271010352 8.750036e-03 -3.0972485 1.977648e-03
## UrineFlow1 -0.0031016415 4.327971e-03 -0.7166502 4.736652e-01
## DaysMentHlthBad -0.0013536816 5.475215e-04 -2.4723807 1.349595e-02
## DaysPhysHlthBad 0.0006020118 6.186968e-04 0.9730321 3.306432e-01
## HealthGen -0.0521366128 4.975551e-03 -10.4785605 4.123413e-25
## PhysActive -0.0223651970 8.959716e-03 -2.4961948 1.262525e-02
change percent
abs((rm.m_3$coefficients - m_3$coefficients) / (m_3$coefficients) * 100)
## (Intercept) SleepHrsNight DaysMentHlthBad Age
## 8.127730e-01 7.276813e-01 1.333351e+02 7.111391e+02
## Gender factor(Race1)2 factor(Race1)3 factor(Race1)4
## 9.510922e+02 1.070554e+02 1.151954e+02 1.049511e+02
## factor(Race1)5 Poverty TotChol BPDiaAve
## 1.015962e+02 1.090183e+02 8.283666e+02 2.581962e+02
## BPSysAve AlcoholYear Smoke100 UrineFlow1
## 1.883370e+02 3.249570e+02 7.430505e+01 6.738520e+02
## DaysPhysHlthBad factor(HealthGen)2 factor(HealthGen)3 factor(HealthGen)4
## 5.476841e+05 9.442259e+01 1.003335e+02 1.018108e+02
## factor(HealthGen)5 PhysActive
## 9.376992e+01 1.147819e+02
The estimated regression coefficients do not change only slightly after removing these observations: 5 of the estimates changed by more than 10%. The p-value for the SleepHrsNight coefficient remains insignificant at the 95% confidence level.
multicollinearity ######################
Pearson correlations
# Covariates to screen for pairwise collinearity (column 1, BMI, is the
# response and is dropped before computing correlations).
var3 <- c(
  "BMI", "SleepHrsNight", "Age", "Gender", "Race1", "TotChol", "BPDiaAve",
  "BPSysAve", "AlcoholYear", "Smoke100", "DaysPhysHlthBad", "PhysActive",
  "Poverty", "UrineFlow1", "DaysMentHlthBad", "HealthGen"
)
newData3 <- df3[, var3]
library("corrplot")
par(mfrow = c(1, 2))
# BUG FIX: `method = "pearson"` was previously passed to as.matrix(), where it
# was silently ignored; it belongs to cor(). (Pearson is also cor()'s default,
# so the printed values were unaffected -- but the intent is now explicit.)
# NOTE(review): cor() requires all columns to be numeric; this assumes the
# df3 columns listed above are stored numerically -- confirm.
cormat3 <- cor(as.matrix(newData3[, -1]), method = "pearson")
p.mat3 <- cor.mtest(as.matrix(newData3[, -1]))$p
corrplot(
cormat3,
method = "color",
type = "upper",
number.cex = 1,
diag = FALSE,
addCoef.col = "black",
tl.col = "black",
tl.srt = 90,
p.mat = p.mat3,
sig.level = 0.05,
insig = "blank",
)
None of the covariates seem strongly correlated.There is no evidence of collinearity from the pair-wise correlations.
collinearity diagnostics (VIF)
car::vif(m_3)
## GVIF Df GVIF^(1/(2*Df))
## SleepHrsNight 1.069282 1 1.034061
## DaysMentHlthBad 1.144347 1 1.069742
## Age 1.342254 1 1.158557
## Gender 1.140187 1 1.067795
## factor(Race1) 1.245248 4 1.027796
## Poverty 1.321502 1 1.149566
## TotChol 1.130821 1 1.063400
## BPDiaAve 1.445271 1 1.202194
## BPSysAve 1.562024 1 1.249810
## AlcoholYear 1.122271 1 1.059373
## Smoke100 1.141210 1 1.068274
## UrineFlow1 1.045366 1 1.022432
## DaysPhysHlthBad 1.247355 1 1.116851
## factor(HealthGen) 1.455605 4 1.048046
## PhysActive 1.166136 1 1.079878
From the VIF values in the output above, once again we do not observe any potential collinearity issues. In fact, the VIF values are fairly small: none of the values exceed 10.
MLR & Diagnosis with interaction ########################################
model_4 add additional risk factors ##
# Log-transform the response (the +1 keeps log defined at BMI == 0) and fit
# the interaction model: all main effects plus SleepHrsNight x Age and
# SleepHrsNight x Gender.
df3$logBMI <- log(df3$BMI + 1)
m_full <- lm(
  logBMI ~ SleepHrsNight + DaysMentHlthBad + Age + Gender + factor(Race1) +
    Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 +
    UrineFlow1 + DaysPhysHlthBad + factor(HealthGen) + PhysActive +
    SleepHrsNight * Age + SleepHrsNight * Gender,
  data = df3
)
summary(m_full)
##
## Call:
## lm(formula = logBMI ~ SleepHrsNight + DaysMentHlthBad + Age +
## Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve +
## AlcoholYear + Smoke100 + UrineFlow1 + DaysPhysHlthBad + factor(HealthGen) +
## PhysActive + SleepHrsNight * Age + SleepHrsNight * Gender,
## data = df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6234 -0.1294 -0.0049 0.1206 0.8062
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3688443 0.0964199 34.939 < 2e-16 ***
## SleepHrsNight -0.0177956 0.0115858 -1.536 0.124685
## DaysMentHlthBad -0.0012489 0.0005503 -2.269 0.023341 *
## Age -0.0033337 0.0019138 -1.742 0.081656 .
## Gender 0.1152578 0.0440267 2.618 0.008907 **
## factor(Race1)2 -0.0485063 0.0194505 -2.494 0.012709 *
## factor(Race1)3 -0.0256227 0.0170349 -1.504 0.132690
## factor(Race1)4 -0.0431426 0.0129089 -3.342 0.000845 ***
## factor(Race1)5 -0.0930094 0.0190114 -4.892 1.07e-06 ***
## Poverty 0.0028324 0.0027972 1.013 0.311366
## TotChol 0.0033009 0.0041444 0.796 0.425845
## BPDiaAve 0.0019674 0.0004209 4.674 3.13e-06 ***
## BPSysAve 0.0015546 0.0003532 4.401 1.13e-05 ***
## AlcoholYear -0.0002695 0.0000462 -5.833 6.23e-09 ***
## Smoke100 -0.0299332 0.0088214 -3.393 0.000703 ***
## UrineFlow1 -0.0027372 0.0043481 -0.630 0.529077
## DaysPhysHlthBad 0.0006245 0.0006386 0.978 0.328195
## factor(HealthGen)2 -0.0860758 0.0306348 -2.810 0.005002 **
## factor(HealthGen)3 -0.1279830 0.0303267 -4.220 2.54e-05 ***
## factor(HealthGen)4 -0.1808754 0.0311307 -5.810 7.14e-09 ***
## factor(HealthGen)5 -0.2476517 0.0328054 -7.549 6.35e-14 ***
## PhysActive -0.0231446 0.0090319 -2.563 0.010456 *
## SleepHrsNight:Age 0.0005509 0.0002746 2.006 0.044979 *
## SleepHrsNight:Gender -0.0166893 0.0063397 -2.632 0.008535 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.195 on 2223 degrees of freedom
## Multiple R-squared: 0.1587, Adjusted R-squared: 0.15
## F-statistic: 18.24 on 23 and 2223 DF, p-value: < 2.2e-16
car::Anova(m_full, type = "III")
Confint(m_full)
## Estimate 2.5 % 97.5 %
## (Intercept) 3.3688442791 3.179762e+00 3.5579268348
## SleepHrsNight -0.0177956336 -4.051579e-02 0.0049245242
## DaysMentHlthBad -0.0012489159 -2.328143e-03 -0.0001696883
## Age -0.0033336861 -7.086650e-03 0.0004192774
## Gender 0.1152578004 2.892000e-02 0.2015956012
## factor(Race1)2 -0.0485062616 -8.664922e-02 -0.0103633015
## factor(Race1)3 -0.0256226809 -5.902864e-02 0.0077832799
## factor(Race1)4 -0.0431425559 -6.845724e-02 -0.0178278742
## factor(Race1)5 -0.0930094175 -1.302913e-01 -0.0557275358
## Poverty 0.0028324405 -2.653000e-03 0.0083178808
## TotChol 0.0033008956 -4.826430e-03 0.0114282217
## BPDiaAve 0.0019673653 1.141908e-03 0.0027928229
## BPSysAve 0.0015546299 8.619339e-04 0.0022473259
## AlcoholYear -0.0002694855 -3.600834e-04 -0.0001788876
## Smoke100 -0.0299332383 -4.723231e-02 -0.0126341685
## UrineFlow1 -0.0027371557 -1.126381e-02 0.0057895026
## DaysPhysHlthBad 0.0006245193 -6.277692e-04 0.0018768079
## factor(HealthGen)2 -0.0860758400 -1.461517e-01 -0.0259999717
## factor(HealthGen)3 -0.1279829527 -1.874546e-01 -0.0685113157
## factor(HealthGen)4 -0.1808753562 -2.419237e-01 -0.1198269982
## factor(HealthGen)5 -0.2476517313 -3.119841e-01 -0.1833193210
## PhysActive -0.0231445878 -4.085637e-02 -0.0054328082
## SleepHrsNight:Age 0.0005509040 1.234621e-05 0.0010894618
## SleepHrsNight:Gender -0.0166893149 -2.912172e-02 -0.0042569113
model 4 diagnosis ###########
# Standard lm diagnostic panels (see ?plot.lm for the six plot types).
par(mfrow = c(2, 3))
for (which_plot in 1:6) {
  plot(m_full, which = which_plot)
}
par(mfrow = c(1, 1)) # reset the plotting grid

# Pull out the quantities needed for manual diagnostics below.
m_full.yhat <- m_full$fitted.values # fitted values
m_full.res <- m_full$residuals      # raw residuals
m_full.h <- hatvalues(m_full)       # leverage (hat values)
m_full.r <- rstandard(m_full)       # internally studentized residuals
m_full.rr <- rstudent(m_full)       # externally studentized residuals
which subject is most outlying with respect to the x space
Hmisc::describe(m_full.h)
## m_full.h
## n missing distinct Info Mean Gmd .05 .10
## 2247 0 2247 1 0.01068 0.005784 0.004624 0.005195
## .25 .50 .75 .90 .95
## 0.006784 0.009383 0.012802 0.017108 0.021401
##
## lowest : 0.002841614 0.002995724 0.003127421 0.003231272 0.003285363
## highest: 0.046156742 0.046910462 0.048836799 0.049628556 0.060943543
m_full.h[which.max(m_full.h)]
## 422
## 0.06094354
Assumption:LINE ##############################
(1)Linear: 2 approaches
partial regression plots
car::avPlots(m_full)
(2)Independence:
# Independence assumption: inspect residual autocorrelation and run the
# Durbin-Watson test (dwtest comes from the lmtest package).
residuals <- resid(m_full)
# NOTE(review): this local `residuals` shadows stats::residuals() for the rest
# of the session -- consider a different name.
acf(residuals, main = "Autocorrelation Function of Residuals")
pacf(residuals, main = "Partial Autocorrelation Function of Residuals")
dw_test <- dwtest(m_full)
print(dw_test)
##
## Durbin-Watson test
##
## data: m_full
## DW = 2.0126, p-value = 0.6159
## alternative hypothesis: true autocorrelation is greater than 0
(3)E: constant var: residuals-fitted values; transform for variance-stable…(total: 4 solutions)
car::residualPlots(m_full, type = "response")
## Test stat Pr(>|Test stat|)
## SleepHrsNight -0.2860 0.77489
## DaysMentHlthBad -0.3074 0.75854
## Age -4.0272 5.835e-05 ***
## Gender -0.2921 0.77024
## factor(Race1)
## Poverty -0.9911 0.32176
## TotChol -1.8770 0.06065 .
## BPDiaAve 0.5684 0.56985
## BPSysAve -4.3959 1.155e-05 ***
## AlcoholYear 2.0151 0.04401 *
## Smoke100 0.5031 0.61493
## UrineFlow1 -0.3027 0.76215
## DaysPhysHlthBad 0.9147 0.36044
## factor(HealthGen)
## PhysActive 0.1306 0.89611
## Tukey test -1.2992 0.19389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(m_full, which = 1)
or
# Residuals vs fitted values: a flat, even band around zero supports the
# constant-variance assumption.
ggplot(m_full, aes(x = m_full.yhat, y = m_full.res)) +
  geom_point(alpha = 0.8, color = "blue") +
  geom_hline(
    yintercept = 0,
    color = "black",
    linetype = "dashed"
  ) +
  labs(
    title = "constant variance assumption",
    x = "y hat",
    y = "Residuals"
  ) +
  theme_minimal()
conclusion: the constant variance assumption is basically not violated. The spread of the residuals appears to be fairly uniform across the range of predicted values, the assumption is more likely to hold
(4)Normality: residuals freq - residuals (4 plots: his, box, Q-Q, shapiro); transform
exam quartiles of the residuals
Hmisc::describe(m_full.res)
## m_full.res
## n missing distinct Info Mean Gmd .05 .10
## 2247 0 2247 1 1.652e-18 0.2165 -0.302479 -0.240208
## .25 .50 .75 .90 .95
## -0.129418 -0.004903 0.120596 0.244933 0.328033
##
## lowest : -0.6233869 -0.6167844 -0.6131555 -0.5549558 -0.5291444
## highest: 0.7058034 0.7180306 0.7472944 0.7760889 0.8062185
Hmisc::describe(m_full.res)$counts[c(".25", ".50", ".75")] #not symmetric
## .25 .50 .75
## "-0.129418" "-0.004903" " 0.120596"
histogram
par(mfrow = c(1, 1))
hist(m_full.res, breaks = 15)
Q-Q plot
qq.m_full.res = car::qqPlot(m_full.res)
m_full.res[qq.m_full.res]
## 910 1943
## 0.8062185 0.7760889
influential observations #################
# Row-wise influence diagnostics for the interaction model m_full (same
# layout as influence3 for the main-effects model).
influence4 <- data.frame(
  Residual      = resid(m_full),
  Rstudent      = rstudent(m_full),
  HatDiagH      = hat(model.matrix(m_full)),
  CovRatio      = covratio(m_full),
  DFFITS        = dffits(m_full),
  COOKsDistance = cooks.distance(m_full)
)
DFFITS
# DFFITS plot for m_full, then the six largest in magnitude.
ols_plot_dffits(m_full)
head(influence4[order(abs(influence4$DFFITS), decreasing = TRUE), ])
From the plot above, we can see the 2 observations with the largest magnitude of DFFITS: observations 879 and 1769. By printing the corresponding values of DFFITS in the output dataset, we obtain their DFFITS values: 0.5673 for observation 879 and 0.5872 for observation 1769.
Cook’s D
# Cook's distance bar chart for m_full, then the six most influential rows.
ols_plot_cooksd_bar(m_full)
head(influence4[order(influence4$COOKsDistance, decreasing = TRUE), ])
From the plot above, there are 7 observations (1048, 1769, 1684, 74, 72, 1689, 1311) located in the intersection region of both outlier and leverage; that is, these observations have both leverage and externally studentized residuals exceeding their respective thresholds. Because of their large DFFITS and Cook's D values, they are potentially influential observations.
The thresholds for the externally studentized residual are -2 and 2, i.e. 2 in magnitude. The leverage threshold (the R default) is 0.011.
leverage
# Outlier-vs-leverage plot for the interaction model.
ols_plot_resid_lev(m_full)
high leverage
# Six rows with the largest hat values.
influence4[order(influence4$HatDiagH,decreasing = T),] %>% head()
high studentized residual
# Six rows with the largest (signed) externally studentized residuals; sort
# on abs(Rstudent) if negative outliers should be surfaced too.
influence4[order(influence4$Rstudent,decreasing = T),] %>% head()
From the DFFITS results, observations 879 and 1769 appear to be influential, and observation 1155 has extraordinarily large leverage. Therefore, I chose to remove the 8 flagged observations and re-fit the model.
# Drop the rows flagged as influential and re-fit the interaction model.
# NOTE(review): eight row indices are removed here, but the narrative above
# says "14 observations" -- confirm the intended set of rows.
rm4.df3 = df3[-c(1048, 1769, 1684, 74, 72, 1689, 1311, 879), ]
# Same specification as m_full (factor coding and both interactions kept).
rm.m_full = lm(
logBMI ~ SleepHrsNight + Age + Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 + DaysMentHlthBad +
DaysPhysHlthBad + factor(HealthGen) + PhysActive + SleepHrsNight*Age + SleepHrsNight*Gender,
rm4.df3
)
Before removing these observations, the estimated coefficients are:
summary(m_full)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3688442791 9.641994e-02 34.9392910 1.463092e-213
## SleepHrsNight -0.0177956336 1.158582e-02 -1.5359841 1.246846e-01
## DaysMentHlthBad -0.0012489159 5.503366e-04 -2.2693673 2.334143e-02
## Age -0.0033336861 1.913770e-03 -1.7419471 8.165600e-02
## Gender 0.1152578004 4.402672e-02 2.6179055 8.907161e-03
## factor(Race1)2 -0.0485062616 1.945046e-02 -2.4938368 1.270922e-02
## factor(Race1)3 -0.0256226809 1.703489e-02 -1.5041295 1.326901e-01
## factor(Race1)4 -0.0431425559 1.290886e-02 -3.3420890 8.452748e-04
## factor(Race1)5 -0.0930094175 1.901136e-02 -4.8923072 1.068055e-06
## Poverty 0.0028324405 2.797222e-03 1.0125906 3.113659e-01
## TotChol 0.0033008956 4.144413e-03 0.7964687 4.258447e-01
## BPDiaAve 0.0019673653 4.209303e-04 4.6738508 3.132952e-06
## BPSysAve 0.0015546299 3.532304e-04 4.4011781 1.127568e-05
## AlcoholYear -0.0002694855 4.619909e-05 -5.8331340 6.234466e-09
## Smoke100 -0.0299332383 8.821413e-03 -3.3932477 7.027982e-04
## UrineFlow1 -0.0027371557 4.348047e-03 -0.6295138 5.290774e-01
## DaysPhysHlthBad 0.0006245193 6.385866e-04 0.9779712 3.281950e-01
## factor(HealthGen)2 -0.0860758400 3.063483e-02 -2.8097380 5.001570e-03
## factor(HealthGen)3 -0.1279829527 3.032671e-02 -4.2201399 2.539608e-05
## factor(HealthGen)4 -0.1808753562 3.113073e-02 -5.8101859 7.137490e-09
## factor(HealthGen)5 -0.2476517313 3.280539e-02 -7.5491171 6.354907e-14
## PhysActive -0.0231445878 9.031868e-03 -2.5625472 1.045608e-02
## SleepHrsNight:Age 0.0005509040 2.746298e-04 2.0059875 4.497853e-02
## SleepHrsNight:Gender -0.0166893149 6.339726e-03 -2.6324978 8.534622e-03
After removing these observations, the estimated coefficients are:
summary(rm.m_full)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3680640105 9.653194e-02 34.8906707 6.115538e-213
## SleepHrsNight -0.0176701164 1.160014e-02 -1.5232675 1.278346e-01
## Age -0.0033187735 1.916556e-03 -1.7316340 8.347797e-02
## Gender 0.1173024056 4.416962e-02 2.6557260 7.970518e-03
## factor(Race1)2 -0.0497356294 1.946971e-02 -2.5545132 1.070000e-02
## factor(Race1)3 -0.0261341401 1.706844e-02 -1.5311386 1.258779e-01
## factor(Race1)4 -0.0448559732 1.294335e-02 -3.4655616 5.391989e-04
## factor(Race1)5 -0.0941809010 1.903251e-02 -4.9484215 8.045103e-07
## Poverty 0.0029605378 2.804053e-03 1.0558065 2.911717e-01
## TotChol 0.0033426614 4.155265e-03 0.8044400 4.212293e-01
## BPDiaAve 0.0019915617 4.218504e-04 4.7210139 2.493370e-06
## BPSysAve 0.0015362161 3.539458e-04 4.3402580 1.487146e-05
## AlcoholYear -0.0002697270 4.626138e-05 -5.8305013 6.335210e-09
## Smoke100 -0.0297821540 8.841691e-03 -3.3683777 7.690604e-04
## UrineFlow1 -0.0028886485 4.392151e-03 -0.6576842 5.108094e-01
## DaysMentHlthBad -0.0011669861 5.544332e-04 -2.1048274 3.541871e-02
## DaysPhysHlthBad 0.0006043782 6.395009e-04 0.9450779 3.447222e-01
## factor(HealthGen)2 -0.0867078232 3.066055e-02 -2.8279933 4.726192e-03
## factor(HealthGen)3 -0.1270707356 3.034738e-02 -4.1872055 2.934592e-05
## factor(HealthGen)4 -0.1799745606 3.115368e-02 -5.7769919 8.676582e-09
## factor(HealthGen)5 -0.2467543622 3.282786e-02 -7.5166149 8.111908e-14
## PhysActive -0.0238843563 9.046253e-03 -2.6402486 8.342681e-03
## SleepHrsNight:Age 0.0005506386 2.750729e-04 2.0017913 4.542893e-02
## SleepHrsNight:Gender -0.0169358141 6.359509e-03 -2.6630694 7.799202e-03
change percent
abs((rm.m_full$coefficients - m_full$coefficients) / (m_full$coefficients) * 100)
## (Intercept) SleepHrsNight Age
## 2.316131e-02 7.053254e-01 1.657323e+02
## Gender factor(Race1)2 factor(Race1)3
## 3.618700e+03 1.431516e+02 4.612213e+01
## factor(Race1)4 factor(Race1)5 Poverty
## 7.506354e+01 1.183016e+02 1.031831e+02
## TotChol BPDiaAve BPSysAve
## 1.801347e+01 3.966602e+01 2.191505e+01
## AlcoholYear Smoke100 UrineFlow1
## 1.173499e+02 1.095149e+04 9.034970e+01
## DaysMentHlthBad DaysPhysHlthBad factor(HealthGen)2
## 5.736501e+01 3.225060e+00 7.342166e-01
## factor(HealthGen)3 factor(HealthGen)4 factor(HealthGen)5
## 7.127646e-01 4.980201e-01 3.623512e-01
## PhysActive SleepHrsNight:Age SleepHrsNight:Gender
## 3.196291e+00 4.817988e-02 1.476988e+00
The estimated regression coefficients do not change only slightly after removing these observations: 5 of the estimates changed by more than 10%. The p-value for the SleepHrsNight coefficient remains insignificant at the 95% confidence level.
multicollinearity ######################
Pearson correlations
# Covariates screened for pairwise collinearity in the interaction model
# (column 1, BMI, is the response and is dropped before computing
# correlations).
var4 <- c(
  "BMI", "SleepHrsNight", "Age", "Gender", "Race1", "TotChol", "BPDiaAve",
  "BPSysAve", "AlcoholYear", "Smoke100", "DaysPhysHlthBad", "PhysActive",
  "Poverty", "UrineFlow1", "DaysMentHlthBad", "HealthGen"
)
newData4 <- df3[, var4]
library("corrplot")
par(mfrow = c(1, 2))
# BUG FIX: `method = "pearson"` was previously passed to as.matrix(), which
# silently ignored it; it belongs to cor(). (Pearson is cor()'s default, so
# the printed values were unchanged -- the intent is now explicit.)
# NOTE(review): cor() requires numeric columns; assumes the df3 columns above
# are stored numerically -- confirm.
cormat4 <- cor(as.matrix(newData4[, -1]), method = "pearson")
p.mat4 <- cor.mtest(as.matrix(newData4[, -1]))$p
corrplot(
cormat4,
method = "color",
type = "upper",
number.cex = 1,
diag = FALSE,
addCoef.col = "black",
tl.col = "black",
tl.srt = 90,
p.mat = p.mat4,
sig.level = 0.05,
insig = "blank",
)
None of the covariates seem strongly correlated.There is no evidence of collinearity from the pair-wise correlations.
collinearity diagnostics (VIF)
car::vif(m_full)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## GVIF Df GVIF^(1/(2*Df))
## SleepHrsNight 13.534700 1 3.678954
## DaysMentHlthBad 1.145332 1 1.070202
## Age 28.092544 1 5.300240
## Gender 28.493511 1 5.337931
## factor(Race1) 1.248932 4 1.028176
## Poverty 1.325671 1 1.151378
## TotChol 1.132169 1 1.064034
## BPDiaAve 1.445314 1 1.202212
## BPSysAve 1.562733 1 1.250093
## AlcoholYear 1.123289 1 1.059853
## Smoke100 1.141471 1 1.068397
## UrineFlow1 1.046493 1 1.022982
## DaysPhysHlthBad 1.248178 1 1.117219
## factor(HealthGen) 1.465793 4 1.048960
## PhysActive 1.173484 1 1.083275
## SleepHrsNight:Age 37.290593 1 6.106602
## SleepHrsNight:Gender 30.038031 1 5.480696
From the VIF values in the output above, the main-effect covariates remain unproblematic, but SleepHrsNight, Age, Gender and the two interaction terms have GVIF values well above 10 (up to about 37). This is structural collinearity between the interaction terms and their component main effects, which is expected when raw (uncentered) interactions are included; centering the variables, or calling car::vif with type = 'predictor' as the printed warning suggests, would give a cleaner assessment. It does not indicate collinearity among the covariates themselves.
# Return the most frequent value in `v` (the statistical mode). Ties are
# broken in favor of the value that appears first in `v`; the result keeps
# the type of `v`.
getMode <- function(v) {
  distinct_vals <- unique(v)
  counts <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Prediction grid: SleepHrsNight varies over its observed range, Age is set to
# its quartiles (for the interaction plot), and every other covariate is held
# fixed at its median (continuous) or mode (categorical).
# NOTE(review): median() errors on factors, so this only works if Gender and
# Race1 are stored numerically in df3 -- confirm, or use getMode() for them.
# Also, m_full applies factor(Race1)/factor(HealthGen), so the fixed values
# here must match levels seen at fit time.
new_data <- expand.grid(SleepHrsNight = seq(min(df3$SleepHrsNight), max(df3$SleepHrsNight), length.out = 100),
Age = quantile(df3$Age, probs = c(0.25, 0.5, 0.75)),
Gender = median(df3$Gender, na.rm = TRUE),
Race1 = median(df3$Race1, na.rm = TRUE),
Poverty = median(df3$Poverty, na.rm = TRUE),
TotChol = median(df3$TotChol, na.rm = TRUE),
BPDiaAve = median(df3$BPDiaAve, na.rm = TRUE),
BPSysAve = median(df3$BPSysAve, na.rm = TRUE),
AlcoholYear = median(df3$AlcoholYear, na.rm = TRUE),
Smoke100 = getMode(df3$Smoke100),
UrineFlow1 = median(df3$UrineFlow1, na.rm = TRUE),
DaysMentHlthBad = median(df3$DaysMentHlthBad, na.rm = TRUE),
DaysPhysHlthBad = median(df3$DaysPhysHlthBad, na.rm = TRUE),
HealthGen = getMode(df3$HealthGen),
PhysActive = getMode(df3$PhysActive)
)
predict
new_data$predicted_BMI <- predict(m_full, newdata = new_data)
interaction
# Visualize the fitted interactions on the prediction grid.
library(ggplot2)
# One predicted-BMI curve per Age quartile.
ggplot(new_data,
       aes(x = SleepHrsNight, y = predicted_BMI, group = factor(Age))) +
  geom_line(aes(color = factor(Age))) +
  labs(title = "Interaction between Sleep Hours and Age on BMI",
       x = "Sleep Hours per Night",
       y = "Predicted BMI")
# NOTE(review): Gender is held at a single value in new_data, so this plot can
# show only one line; include both Gender levels in the grid to display the
# SleepHrsNight x Gender interaction.
ggplot(new_data,
       aes(x = SleepHrsNight, y = predicted_BMI, group = factor(Gender))) +
  geom_line(aes(color = factor(Gender))) +
  labs(title = "Interaction between Sleep Hours and Gender on BMI",
       x = "Sleep Hours per Night",
       y = "Predicted BMI")
cross validation
library(caret)
## Loading required package: lattice
# Hold-out evaluation: 70/30 split (stratified on SleepHrsNight by caret).
splitIndex <-
  createDataPartition(df3$SleepHrsNight, p = 0.7, list = FALSE)
trainData <- df3[splitIndex, ]
testData <- df3[-splitIndex, ]
# BUG FIX: the test MSE must compare predictions with the model's actual
# response, logBMI -- the original compared them with SleepHrsNight, which is
# a predictor in m_full. Also re-fit on the training rows only, so the test
# set is not used in estimation (m_full was fit on all of df3).
m_train <- update(m_full, data = trainData)
predictions <- predict(m_train, newdata = testData)
mse <- mean((testData$logBMI - predictions) ^ 2)
# 10-fold cross-validation of a linear model via caret.
control <-
  trainControl(method = "cv", number = 10) # 10-fold cross-validation
# NOTE(review): the CV model's response is SleepHrsNight, while the rest of
# this analysis models logBMI; `~ .` also feeds BMI (and logBMI, if present
# in df3) in as predictors. Confirm that SleepHrsNight is really the intended
# outcome here.
cv_model <-
  train(
    SleepHrsNight ~ .,
    data = df3,
    method = "lm",
    trControl = control
  )
cv_model
## Linear Regression
##
## 2247 samples
## 16 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2022, 2022, 2021, 2022, 2023, 2022, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 1.276784 0.05185099 0.9954254
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
(cv_results <- cv_model$results)